home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /* Change log:
- * $Log: irtfiles.c,v $
- * Revision 1.32 92/05/06 17:32:14 jonathan
- * Added new global for current_filename and current_filecount (from
- * riddle@rice.edu).
- *
- * Revision 1.31 92/04/30 12:25:09 jonathan
- * changed a couple of s_free's to free's for ULTRIX CC.
- *
- * Revision 1.30 92/04/29 08:09:55 shen
- * add global variable "_indexable_section", default is true
- *
- * Revision 1.29 92/04/28 17:53:24 jonathan
- * Replaced directory routines with scandir.
- *
- * Revision 1.28 92/03/20 11:02:55 jonathan
- * Added code to handle switches for word_pairs and word_position info.
- *
- * Revision 1.27 92/02/13 11:23:21 jonathan
- * Removed printable_time() from index logging, since it's done by waislog.
- *
- * Revision 1.26 92/02/12 13:31:29 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /*
- * Indexes the words in a text file.
- *
- * Port of irtfiles.lisp.
- *
- * -brewster 6/90
- */
-
- /* the main functions are:
- * index_text_file
- * index_directory
- *
- * Some of the policy issues coded in this file are
- * What extra weight should the headline get?
- *
- */
-
- #include <ctype.h>
- #include <string.h>
- #include "panic.h"
- #include "irdirent.h"
- #include "irhash.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
-
- #ifndef THINK_C
- #include <sys/types.h>
- #include <sys/stat.h>
- #endif /* ndef THINK_C */
-
- #define MAX_LINE_LENGTH 1000 /* characters */
- #define extra_weight_for_header 10
-
- #ifdef UNIX
- #define PRINT_AS_INDEXING true /* also defined in irfiles.c */
- #else
- #define PRINT_AS_INDEXING false
- #endif
-
- char* header_flag_1; /* reset to NULL by index_text_file before it reads a file */
- char* header_flag_2; /* not referenced by the code visible in this file */
- long len_of_files_since_last_delete = 0; /* index_text_file adds the word count of each indexed line */
- long len_of_files_since_last_flush = 0; /* index_text_file adds the word count of each indexed line */
- long total_indexed_file_length = 0; /* adjusted up and back down in finish_document */
-
- boolean indexingForBeta = false; /* when true, finish_document stores a node number in the date field */
-
- long _indexable_section = 1; /* when 0, index_text_file does not index the words of a line */
-
- char *current_filename = NULL; /* copy of the name of the file index_text_file is working on */
- int current_filecount = 0; /* incremented by index_text_file each time current_filename is set */
-
- boolean index_contents = true; /* when false, lines are read but their words are not indexed */
- boolean filter_contents; /* when true, index_text_file reads the file through filter_program */
- char filter_program[MAX_LINE_LENGTH]; /* command run with "-h <file>" and "-f <file>" by index_text_file */
-
- /* Handling Word Pairs */
-
- /* makes a word_pair out of a two words:
- make_joint_word("abcdefghijklmnopqrstuvwxyz", "123456789012345678901");
- "abcdefghij1234567890"
- make_joint_word("abcdefghijkl", "123");
- "abcdefghij123"
- make_joint_word("abc", "123");
- "abc123" */
-
- char *make_joint_word(word1, word2)
- char* word1;
- char* word2;
- {
- static char new_word[MAX_WORD_LENGTH + 1];
- strncpy(new_word, word1, MAX_WORD_LENGTH / 2);
- strncpy(new_word + MIN(MAX_WORD_LENGTH / 2, strlen(word1)),
- word2, MAX_WORD_LENGTH - (MAX_WORD_LENGTH / 2));
- return(new_word);
- }
-
- /* returns 0 is successful, non-0 if error */
- static long add_word_before_pairs _AP((char *word, long char_pos,
- long line_pos, long weight,
- long doc_id, time_t date,
- boolean capitalized, database* db,
- boolean word_position, boolean word_pairs));
-
- static long
- add_word_before_pairs(word, char_pos, line_pos,
- weight, doc_id, date, capitalized, db,
- word_position, word_pairs)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t date; /* display day of this document, 0 if not known */
- boolean capitalized; /* if the word started with a cap */
- database* db; /* database to insert the document */
- boolean word_position; /* if true, include word position in index. */
- boolean word_pairs; /* if true, add pairs of capitalized words */
- {
- static char last_word[MAX_WORD_LENGTH + 1];
- static long last_doc_id = -1;
- /* The way it works is it remembers if the last word if it was
- capitalized (if not it clears the saved word).
- If another capitalized word comes along next
- (and it is in the same document), then it makes a joint word and calls
- add_word with it.
-
- This does not throw away stopwords before forming pairs, so it will
- not be quite what CMDRS does. This should only be used in seeker
- and serial searching before proximity is used.
-
- */
- if(capitalized && word_pairs){
- if(last_word[0] != '\0' && last_doc_id == doc_id){
- add_word(make_joint_word(last_word, word),
- char_pos, line_pos, weight, doc_id, date, 1L, db);
- }
- else{
- last_word[0] = '\0';
- }
- strncpy(last_word, word, MAX_WORD_LENGTH);
- last_doc_id = doc_id;
- }
- else{ /* not capitalized or word_pairs is false */
- last_word[0] = '\0';
- }
- return(add_word(word, char_pos, line_pos, weight, doc_id, date, 0L, db, word_position));
- }
-
-
- #ifdef NOTUSED
- #define WORD_LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
-
-
- static char *new_word _AP((char* line,char* word));
-
- static char *new_word(line,word)
- char *line;
- char *word;
- {
- /* This copies the first word from line into word while downcasing it.
- It returns a pointer into line that is after the word,
- which can be used to call this function again.
- If there are no words left, then NULL is returned,
- and word is length 0.
- There has got to be a better way.
- */
- long i = 0;
- char *beginning_ptr = strpbrk(line, WORD_LETTERS);
- char *next_word;
- long length;
- if(NULL == beginning_ptr){
- word[0] = '\0';
- return(NULL);
- }
- length = strspn(beginning_ptr, WORD_LETTERS);
- next_word = length + beginning_ptr;
-
- length = MIN(MAX_WORD_LENGTH,length);
- for(i=0; i<length; i++){
- word[i] = char_downcase((unsigned long)*beginning_ptr++);
- }
- word[i] = '\0';
- return(next_word);
- }
-
- static boolean reasonable_word _AP((char* word));
-
- static boolean reasonable_word(word)
- char* word;
- /* this should be more sophisticated */
- {
- if(strlen(word) > 1){
- return(TRUE);
- }
- else{
- return(FALSE);
- }
- }
-
- #endif /* def NOTUSED */
-
-
- /* MAPPING A FUNCTION OVER WORDS (QUICKLY) */
-
-
- /* map_over_words("foo bar baz", 0L, 1L, 0L, &integer, false, db, dummy_wordfunction) */
- static long dummy_wordfunction(word, char_pos, line_pos,
- weight, doc_id, date, capitalized, db)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t date; /* display day of this document, 0 if not known */
- boolean capitalized; /* if the word started with a cap */
- database* db; /* database to insert the document */
- {
- if(word != NULL)
- printf("word: %s, char_pos: %ld\n", word, char_pos);
- return(0);
- }
-
- /* returns the number of words added, or -1 if an error occurred */
- long map_over_words(line,
- document_id,
- weight,
- file_position_before_line,
- line_length,
- newline_terminated,
- db,
- wordfunction,
- word_position, word_pairs)
- char* line;
- long document_id;
- long weight;
- long file_position_before_line;
- long *line_length;
- boolean *newline_terminated;
- database* db;
- wordfunc *wordfunction;
- boolean word_position, word_pairs;
- {
- /* Add words to the index if it should be done.
- * Returns the number of words added.
- * Should it return the amount of weight added?
- * The line length is side effected with the length of the line.
- * Newline_terminated is set based on whether the last character
- * in the string was a newline. If it was not, then it fgets probably
- * did not retrieve the whole line.
- */
-
- long position_in_word = 0;
- long word_count = 0;
- char word[MAX_WORD_LENGTH + 1];
- unsigned long ch;
- long char_count = 0;
- boolean capitalized = false; /* if the word starts with a cap */
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- boolean alnum = isalnum(ch);
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word == 0)
- capitalized = isupper((unsigned long)ch)?true:false;
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not an in a word */
- if(position_in_word != 0){
- /* then we have collected a word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- if(0 !=
- (*wordfunction)(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position,
- word_pairs))
- return(-1); /* error */
- word_count++;
- }
- position_in_word = 0;
- }
- }
- }
- /* finish last word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- if(0 != (*wordfunction)(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position, word_pairs))
- return(-1);
- word_count++;
- }
-
- /* for debugging
- if(char_count - 1 != strlen(line)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "char_count: %ld, strlen: %ld", char_count, strlen(line));
- }
- */
- if(newline_terminated != NULL){
- if('\n' != line[char_count-2])
- *newline_terminated = false;
- else
- *newline_terminated = true;
- }
- if(line_length != NULL)
- *line_length = char_count - 1;
- return(word_count);
- }
-
-
- static long add_words_if_appropriate
- _AP((char* line,long document_id,long weight,long file_position_before_line,
- long* line_length,boolean* newline_terminated,database* db,
- boolean word_position, boolean word_pairs));
-
- static long
- add_words_if_appropriate(line,
- document_id,
- weight,
- file_position_before_line,
- line_length,
- newline_terminated,
- db,
- word_position, word_pairs)
- char* line;
- long document_id;
- long weight;
- long file_position_before_line;
- long *line_length;
- boolean *newline_terminated;
- database* db;
- boolean word_position, word_pairs;
- {
- /* Add words to the index if it should be done.
- * Returns the number of words added.
- * Should it return the amount of weight added?
- * The line length is side effected with the length of the line.
- * Newline_terminated is set based on whether the last character
- * in the string was a newline. If it was not, then it fgets probably
- * did not retrieve the whole line.
- */
-
- long position_in_word = 0;
- long word_count = 0;
- char word[MAX_WORD_LENGTH + 1];
- unsigned long ch;
- long char_count = 0;
- boolean capitalized = false; /* if the word starts with a cap */
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- boolean alnum = isalnum(ch);
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word == 0)
- capitalized = isupper((unsigned long)ch)?true:false;
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not an in a word */
- if(position_in_word != 0){
- /* then we have collected a word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- add_word_before_pairs(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position, word_pairs);
- word_count++;
- }
- position_in_word = 0;
- }
- }
- }
- /* finish last word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- add_word(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- 0L,
- db);
- word_count++;
- }
-
- /* for debugging
- if(char_count - 1 != strlen(line)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "char_count: %ld, strlen: %ld", char_count, strlen(line));
- }
- */
- if('\n' != line[char_count-2])
- *newline_terminated = false;
- else
- *newline_terminated = true;
-
- *line_length = char_count - 1;
- return(word_count);
- }
-
- static int nodecompare _AP((unsigned long* i,unsigned long* j));
-
/* Ordering function for the node table: compares the first element of
   the two entries.  Returns -1, 0 or 1 when *i is smaller than, equal to
   or larger than *j. */
static int
nodecompare(i,j)
unsigned long *i, *j;
{
  unsigned long left = i[0];
  unsigned long right = j[0];

  return((left > right) - (left < right));
}
-
- #define nodeRange 256 /* 2048 sprint nodes on a full sized machine - should
- be passed in */
- #define iterations_to_reorder 50 /* 1 is best but slow */
-
- static void finish_document
- _AP((char* header,char* line,long document_id,
- document_table_entry* the_document_table_entry,
- long file_position_before_line,
- long file_position_before_document,database* db,
- boolean word_position, boolean word_pairs));
-
- static void
- finish_document(header,line,document_id,the_document_table_entry,
- file_position_before_line, file_position_before_document,
- db, word_position, word_pairs)
- char* header;
- char* line;
- long document_id;
- document_table_entry* the_document_table_entry;
- long file_position_before_line;
- long file_position_before_document;
- database* db;
- boolean word_position, word_pairs;
- { long line_length;
- boolean newline_terminated;
- if(0 != strlen(header)){
- /* add weights for the header (if there was one) */
- long number_of_words =
- map_over_words(header, document_id,
- extra_weight_for_header,
- file_position_before_line-
- file_position_before_document,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- word_position, word_pairs);
- if(number_of_words == -1)
- waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed");
- db->total_word_count += number_of_words;
- the_document_table_entry->document_length += number_of_words;
- }
-
- /* store out the document header here */
- the_document_table_entry->headline_id =
- write_headline_table_entry(header, db);
- if(NULL == line)
- { /* EOF */
- /* if it goes to the end of the file, then
- * set the end_character to 0 so that it is clear that
- * it goes to the end of the file.
- */
- the_document_table_entry->end_character = 0;
- }
- else /* set the end_character */
- the_document_table_entry->end_character = file_position_before_line;
-
-
- /*
- waislog("start char: %ld, end char: %ld",
- the_document_table_entry->start_character,
- the_document_table_entry->end_character);
- */
-
- if (indexingForBeta)
- { /* we need to decide which sprint node this doc will go in.
- for now we will store the sn in the date field, but that
- is temporary
- NOTE that we must subract 1 from document_id, since we want
- a 0 based number
- */
- static unsigned long* nodes = NULL; /* size/node# inited to 0 to 2047 */
- static long minPos;
- unsigned long size;
-
- if (nodes == NULL)
- { long i;
- long startPos;
- time_t temp_time;
-
- nodes = (unsigned long*)s_malloc(sizeof(unsigned long)*nodeRange*2);
- srand((int)time(&temp_time)); /* try to distribute the entries */
- startPos = rand() % nodeRange; /* for indexes with < nodeRng docs */
- for (i = 0; i < nodeRange; i++)
- { nodes[(i * 2) + 1] = (i + startPos) % nodeRange;
- nodes[i * 2] = 0;
- }
- minPos = 0;
- /*printf("init: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- }
-
- /* place the document in the emptiest node (at minPos) */
- the_document_table_entry->date = (time_t)nodes[(minPos * 2) + 1];
-
- /* increment the size to account for document */
- size = nodes[minPos * 2];
- size += (the_document_table_entry->end_character -
- the_document_table_entry->start_character);
- nodes[minPos * 2] = size;
-
- if ((the_document_table_entry->end_character -
- the_document_table_entry->start_character) > 100000)
- printf("big doc %lu %s\n",the_document_table_entry->end_character - the_document_table_entry->start_character,header);
-
- minPos++;
-
- /* possibly reorder it */
- if (minPos > iterations_to_reorder)
- {
- long i;
- minPos = 0;
- /*printf("before: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- qsort((char*)nodes,nodeRange,sizeof(unsigned long) * 2,nodecompare);
- /*printf("after: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- printf("just sorted nodes, min: ");
- for (i = 0; i < 10; i++)
- printf("%lu ",nodes[i * 2]);
- printf(", max: %lu/%lu\n",nodes[(nodeRange * 2)-2],nodes[(nodeRange * 2)-1]);
- }
-
-
-
- #ifdef old
- sn = (document_id - 1) % 2048; /* 2048 = sn's in a full machine */
-
- /* should also take into account the "fullness" of any particular
- node */
- the_document_table_entry->date = (time_t)sn;
- /* waislog(WLOG_LOW, WLOG_INFO,
- "put %s in sprint node %ld",header,sn);*/
- #endif /* def old */
- }
-
- write_document_table_entry(the_document_table_entry, db);
- cprintf(PRINT_AS_INDEXING, ".");
- total_indexed_file_length = /* set this so the speed looks right */
- total_indexed_file_length + file_position_before_line;
- total_indexed_file_length = /* set it back */
- total_indexed_file_length - file_position_before_line;
- }
-
- #define LENGTH_OF_NEWLINE 1 /* this will be 2 on a PC, I think */
-
- FILE* input_stream = 0;
-
- void index_text_file(filename,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs)
- char* filename;
- boolfunc *separator_function;
- voidfunc *header_function;
- longfunc *date_function;
- voidfunc *finish_header_function;
- char *type;
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- boolean word_position, word_pairs;
- {
- /* Addes words to the index for a given file.
- * The function arguments can be NULL which means it would
- * always answer NULL.
- * separator_function is called on every line to see if it
- * separates documents.
- * header_function is called on every line so that a headline
- * can be accumulated. This assumes that it will side effect global
- * variables.
- * finish_header_function is called when the document is finished
- * (by separator function responding TRUE or EOF) this will return
- * the headline string or NULL.
- * Presumably finish_header_function will use the
- * effects of header_function. finish_header_function
- * will only be called once, so it should clear whatever state
- * header_function has set.
- * if check_for_text_file then it looks to see if first character
- * in the file is a printable character.
- * if check_for_file_already_indexed then it looks through the filename
- * file to see if the file has not been indexed. If it has,
- * then it is checked to see if it is up-to-date. (it does not
- * kill the old entry (maybe it should)).
- */
-
- long filename_id;
- document_table_entry the_document_table_entry;
- long document_id = next_document_id(db);
- long file_position_before_line = 0;
- long file_position_before_document = 0;
- long date;
- char header[MAX_LINE_LENGTH];
- char line[MAX_LINE_LENGTH];
- char newtype[MAX_LINE_LENGTH];
- long file_size = 0;
-
- input_stream = s_fopen(filename, "r");
-
- if(NULL == input_stream){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "File %s does not exist", filename);
- /* then the is not a valid file to be indexed */
- return;
- }
- if(check_for_file_already_indexed){
- time_t time;
- char full_path[MAX_FILENAME_LEN];
- truename(filename, full_path);
- if(true == filename_in_database(full_path, type, &time, db)){
- /* check that it is the same time as this file */
- if(time == file_write_date(filename)){
- waislog(WLOG_HIGH, WLOG_INDEX,
- "File %s already indexed", filename);
- s_fclose(input_stream);
- return;
- }
- }
- }
-
- /* Make the current filename accessible via global variables.
- * Increment current_filecount so routines can efficiently detect
- * changes in the current file.
- * -- Prentiss Riddle, Rice ONCS, riddle@rice.edu, 5/6/92
- */
-
- if(current_filename == NULL) current_filename = s_malloc(MAX_FILENAME_LEN+1);
-
- strncpy(current_filename, filename, MAX_FILENAME_LEN);
- current_filecount++;
-
- if(check_for_text_file){
- /* if we need this to be a text file, check the first character
- for a printable character */
- long ch = fgetc(input_stream);
- /* printf("First character is '%c'\n", ch); */
- if(EOF == ch || (!isprint(ch) && !isspace(ch))){
- s_fclose(input_stream);
- return;
- }
- ungetc(ch, input_stream);
- }
-
- header[0] = '\0'; /* set it to the empty string */
-
- /* filter type */
- if (filter_contents) {
- char cmdline[MAX_LINE_LENGTH];
- FILE *f;
- struct stat sb;
-
- /* no error checking yet -- fix fix! */
- /* get header */
- sprintf(cmdline, "%s -h %s", filter_program, filename);
- f = popen(cmdline, "r");
- if (f == NULL) {
- waislog(WLOG_HIGH, WLOG_ERROR, "couldn't run filter program");
- s_fclose(input_stream);
- return;
- }
- /* get headline */
- fgets(header, MAX_LINE_LENGTH, f);
- header[strlen(header)-1] = '\0'; /* get rid of \n */
- /* get type */
- fgets(newtype, MAX_LINE_LENGTH, f);
- newtype[strlen(newtype)-1] = '\0'; /* get rid of \n */
- /* type = newtype;*/
- strcpy(type, newtype);
- /* get integer: 1 = index contents, 0 = don't */
- fgets(line, MAX_LINE_LENGTH, f);
- index_contents = atoi(line);
- pclose(f);
-
- /* run file through filter and use that as input stream */
- sprintf(cmdline, "%s -f %s", filter_program, filename);
- f = popen(cmdline, "r");
-
- /* get size of file */
- if (stat(filename, &sb) >= 0)
- file_size = sb.st_size;
-
- s_fclose(input_stream);
- input_stream = f;
-
- }
-
- /* write out the filename */
- filename_id = write_filename_table_entry(filename, type, db);
-
- /* (if (not *drop_table*) (make_drop_table)) maybe put in later */
-
- header_flag_1 = NULL;
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = 0;
- the_document_table_entry.document_length = 0;
- the_document_table_entry.number_of_lines = 0;
- the_document_table_entry.date = 0;
-
- while(TRUE){
- long line_length;
- boolean newline_terminated;
- char* read_line_result;
- boolean eof;
-
- /* printf("ftell: %ld\n", ftell(input_stream)); */
- /* read a line */
- read_line_result = fgets(line, MAX_LINE_LENGTH, input_stream);
- beFriendly();
-
- /* eof = feof(input_stream); */ /* zero means not eof */
- eof = !read_line_result;
-
- the_document_table_entry.number_of_lines++;
-
- if(eof ||
- ((NULL != separator_function) &&
- separator_function(line))){
-
- /* we are processing a separator, therefore we should
- * finish off the last document, and start a new one
- */
- if(NULL != finish_header_function){
- finish_header_function(header);
- }
- if(0 == strlen(header)){
- char full_path[1000];
- char directory[1000];
- truename(filename, full_path);
- sprintf(header, "%s %s", pathname_name(full_path),
- pathname_directory(full_path, directory));
- }
- the_document_table_entry.number_of_lines--; /* dont count separator */
- /* finish off the last */
- finish_document(header, line, document_id,
- &the_document_table_entry,
- file_size ? file_size : (
- eof? /* if EOF, use file length */
- file_length(input_stream):file_position_before_line
- ),
- file_position_before_document,
- db, word_position, word_pairs);
- /* initialize the next one */
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = file_position_before_line;
- the_document_table_entry.number_of_lines = 1; /* count separator */
- the_document_table_entry.date = 0;
- file_position_before_document = file_position_before_line;
- header[0] = '\0';
-
- document_id = next_document_id(db);
-
- if(!eof)
- { /* not EOF */
- if(NULL != header_function){
- header_function(line);
- }
- if (date_function != NULL &&
- (date = date_function(line)) > 0)
- the_document_table_entry.date = date;
- line_length = strlen(line);
- newline_terminated = true;
- }
- else{ /* EOF */
- /* printf("closing the file\n"); */
- if (filter_contents)
- pclose(input_stream);
- else
- s_fclose(input_stream);
- return;
- }
- }
-
- else{
- /* not a separator or EOF so process the line */
- long number_of_words;
- if(NULL != header_function) header_function(line);
- if (date_function != NULL &&
- the_document_table_entry.date == 0 &&
- (date = date_function(line)) > 0)
- the_document_table_entry.date = date;
-
-
- if(index_contents ) {
- if( _indexable_section) {
- number_of_words = map_over_words(line, document_id, 1L,
- file_position_before_line -
- file_position_before_document,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- word_position, word_pairs);
- if(number_of_words == -1)
- waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed");
- the_document_table_entry.document_length += number_of_words;
- len_of_files_since_last_delete += number_of_words;
- len_of_files_since_last_flush += number_of_words;
- db->total_word_count += number_of_words;
- }
- else
- newline_terminated = 0;
- }
- }
- if(newline_terminated)
- file_position_before_line += (line_length +
- LENGTH_OF_NEWLINE /* in case of crlf */
- - 1 /* fgets gets one newline */
- );
- else
- file_position_before_line = ftell(input_stream);
-
-
- /* for debugging
- if(file_position_before_line != ftell(input_stream)) {
- waislog(WLOG_LOW, WLOG_INFO, "ftell: %ld, computed ftell: %ld",
- ftell(input_stream),
- file_position_before_line);
- }
- */
-
- }
- }
-
-
-
-
- /* return TRUE if it is a directory, FALSE otherwise */
- boolean directoryp(file)
- char *file;
-
- {
- #ifdef THINK_C
- return(false);
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if((stbuf.st_mode & S_IFMT) == S_IFDIR)
- return(true);
- return(FALSE);
- #endif
- }
-
- /* return true if it is a file, FALSE otherwise */
- boolean filep(file)
- char *file;
- {
- #ifdef THINK_C
- return(probe_file(file));
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if(!((stbuf.st_mode & S_IFMT) == S_IFDIR))
- return(true);
- return(FALSE);
- #endif
- }
-
- /* recursively indexes the directory specified.
- * If it is a file, then index it.
- */
- void index_directory(file,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs)
- char *file;
- boolfunc *separator_function;
- voidfunc *header_function;
- longfunc *date_function;
- voidfunc *finish_header_function;
- char *type;
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- boolean word_position, word_pairs;
- {
- #ifndef THINK_C
- struct dirent **list;
- long i, j;
-
- if(filep(file)){
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", file);
- index_text_file(file, separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs);
- }
- else if(directoryp(file)){
- if ((i = scandir(file, &list, NULL, NULL)) < 0) {
- return;
- }
- for(j = 0; j < i; j++) {
- char name[1000]; /* max filename size */
-
- if(strcmp(list[j]->d_name, ".") == 0
- || strcmp(list[j]->d_name, "..") == 0
- )
- continue;
-
- strcpy(name, file); /* copy the filename into the name variable */
- strcat(name, "/");
- strcat(name, list[j]->d_name);
- index_directory(name, separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs);
- }
- if(list != NULL) {
- for (j = 0; j < i; j++)
- if(list[j] != NULL) free((char *)list[j]);
- free((char *)list);
- }
- #endif /*ndef THINK_C */
- }
- }
-
-
-
-